import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import joblib
import os
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif']=['SimHei'] #显示中文
# 设置Seaborn样式
sns.set(font='Microsoft YaHei')
d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\_distributor_init.py:32: UserWarning: loaded more than 1 DLL from .libs: d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll stacklevel=1)
# 获取字典保存各个模型的最终结果
result_model = {}
output_file = 'output/TN_NH3_N2O'
input_file = 'data/TN_NH3_N2O'
model_save_file = 'output/TN_NH3_N2O/model'
os.makedirs(output_file, exist_ok=True)
os.makedirs(model_save_file, exist_ok=True)
# target = ['TN loss (%)', 'NH3-N loss (%)', 'N2O-N loss (%)']
target = 'TN loss (%)' # 要预测啥就换个名字
# target = ['EF %', 'LN(EF)', 'LNR(N2O)', 'N2O rate(kg N ha-1 y-1)']
data_path = f'{input_file}/data_for_{target}.csv'
data_tree_path = f'{input_file}/data_for_tree_{target}.csv'
model_performance_path = f'{output_file}/Model_{target}.html'
model_performance_json = f'{output_file}/result_model_{target}.json'
data_all_ef = pd.read_csv(data_path)
X_all = data_all_ef.iloc[:, :-1]
y_all = data_all_ef.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=2023)
y_train = y_train.reset_index(drop=True)
## 为了正确评估模型性能,将数据划分为训练集和测试集,并在训练集上训练模型,在测试集上验证模型性能。
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)
input_cols = X_all.columns.tolist()
data_all_tree = pd.read_csv(data_tree_path)
X_allt = data_all_tree.iloc[:, :-1]
y_allt = data_all_tree.iloc[:, -1]
X_traint, X_testt, y_traint, y_testt = train_test_split(X_all, y_all, test_size=0.2, random_state=2023)
y_traint = y_traint.reset_index(drop=True)
## 为了正确评估模型性能,将数据划分为训练集和测试集,并在训练集上训练模型,在测试集上验证模型性能。
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 450 entries, 466 to 537 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 material_0 450 non-null int64 1 initial CN(%) 450 non-null float64 2 initial moisture content(%) 450 non-null float64 3 initial pH 450 non-null float64 4 material_1 450 non-null int64 5 Excipients 450 non-null int64 6 initial TN(%) 450 non-null float64 7 initial TC(%) 450 non-null float64 8 Additive Species 450 non-null int64 dtypes: float64(5), int64(4) memory usage: 35.2 KB
X_train.describe()
| material_0 | initial CN(%) | initial moisture content(%) | initial pH | material_1 | Excipients | initial TN(%) | initial TC(%) | Additive Species | |
|---|---|---|---|---|---|---|---|---|---|
| count | 450.000000 | 450.000000 | 450.000000 | 450.000000 | 450.000000 | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| mean | 2.931111 | 15.603382 | 34.574044 | 4.088483 | 3.382222 | 27.482222 | 0.975466 | 23.110170 | 3.066667 |
| std | 0.790907 | 13.498014 | 32.949480 | 4.245029 | 1.749488 | 13.073746 | 1.740370 | 23.320782 | 1.180766 |
| min | 0.000000 | -1.000000 | -1.000000 | -1.000000 | 0.000000 | 0.000000 | -1.000000 | -1.000000 | 0.000000 |
| 25% | 3.000000 | -1.000000 | -1.000000 | -1.000000 | 3.000000 | 17.000000 | -1.000000 | -1.000000 | 3.000000 |
| 50% | 3.000000 | 17.980000 | 56.230000 | 6.657690 | 4.000000 | 34.000000 | 1.358654 | 30.325000 | 3.000000 |
| 75% | 3.000000 | 25.000000 | 65.000000 | 7.795000 | 5.000000 | 39.000000 | 2.070000 | 39.800000 | 4.000000 |
| max | 5.000000 | 53.734940 | 89.800000 | 10.700000 | 5.000000 | 39.000000 | 13.400000 | 196.750000 | 4.000000 |
X_train
| material_0 | initial CN(%) | initial moisture content(%) | initial pH | material_1 | Excipients | initial TN(%) | initial TC(%) | Additive Species | |
|---|---|---|---|---|---|---|---|---|---|
| 466 | 3 | 20.0 | 65.0 | -1.00000 | 3 | 5 | -1.00 | -1.00 | 4 |
| 148 | 2 | -1.0 | -1.0 | -1.00000 | 5 | 39 | 1.40 | -1.00 | 3 |
| 260 | 3 | 25.0 | 60.0 | 7.60000 | 3 | 18 | 7.87 | 196.75 | 4 |
| 183 | 3 | -1.0 | -1.0 | 6.85618 | 4 | 39 | -1.00 | -1.00 | 1 |
| 450 | 3 | 15.0 | 75.0 | -1.00000 | 4 | 6 | 2.40 | 35.30 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 470 | 3 | 32.0 | 49.6 | 7.84000 | 3 | 23 | 1.15 | 37.30 | 4 |
| 52 | 4 | -1.0 | 60.5 | -1.00000 | 5 | 39 | -1.00 | -1.00 | 3 |
| 515 | 3 | 25.0 | -1.0 | -1.00000 | 3 | 23 | -1.00 | -1.00 | 4 |
| 454 | 3 | 18.0 | 70.0 | -1.00000 | 4 | 6 | 2.10 | 38.00 | 4 |
| 537 | 2 | 30.0 | 60.6 | -1.00000 | 5 | 14 | 1.41 | 36.80 | 4 |
450 rows × 9 columns
y_train.isnull().any()
False
print(X_train.isnull().any())
material_0 False initial CN(%) False initial moisture content(%) False initial pH False material_1 False Excipients False initial TN(%) False initial TC(%) False Additive Species False dtype: bool
np.isnan(X_train).any()
material_0 False initial CN(%) False initial moisture content(%) False initial pH False material_1 False Excipients False initial TN(%) False initial TC(%) False Additive Species False dtype: bool
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=1000, criterion='mse',
max_depth=10, min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_features='auto', max_leaf_nodes=None,
bootstrap=True, oob_score=False,
n_jobs=1, random_state=None,
verbose=0, warm_start=False)
rf_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train):
x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
print(x_tr.shape, y_tr.shape)
rf_model.fit(x_tr, y_tr)
rf_pred_valid = rf_model.predict(x_va)
rf_preds[valid_index] = rf_pred_valid
print('The rmse of the RFRegression is:',metrics.mean_squared_error(y_va,rf_pred_valid))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, rf_pred_valid, 'r', label='xgb predict')
plt.legend(loc='best')
plt.title('xgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
(360, 9) (360,) The rmse of the RFRegression is: 126.0314666685265
(360, 9) (360,) The rmse of the RFRegression is: 145.5660239711958
(360, 9) (360,) The rmse of the RFRegression is: 131.31412750536043
(360, 9) (360,) The rmse of the RFRegression is: 119.95701259689821
(360, 9) (360,) The rmse of the RFRegression is: 252.0317008134364
rf_pred_test = rf_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,rf_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, rf_pred_test, 'r', label='rf predict')
plt.legend(loc='best')
plt.title('rf-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(rf_model, f'{model_save_file}/rf_model.pkl')
result_model['RandomForest(k)'] = metrics.mean_squared_error(y_test,rf_pred_test)
The rmse of the test is: 152.48951026880778
import xgboost as xgb
xgb_preds = np.zeros(X_train.shape[0])
kf = KFold(n_splits=num_folds, shuffle=True)
xgb_model = xgb.XGBRegressor(max_depth=8,
learning_rate=0.1,
n_estimators=300,
n_jobs=4,
colsample_bytree=0.8,
subsample=0.8,
random_state=32,
tree_method='hist'
)
for train_index, valid_index in kf.split(X_train):
x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
print(x_tr.shape, y_tr.shape)
xgb_model.fit(x_tr, y_tr)
xgb_pred_valid = xgb_model.predict(x_va)
xgb_preds[valid_index] = xgb_pred_valid
# print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
print('The rmse of the XgbRegression is:',metrics.mean_squared_error(y_va,xgb_pred_valid))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, xgb_pred_valid, 'r', label='xgb predict')
plt.legend(loc='best')
plt.title('xgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
(360, 9) (360,) The rmse of the XgbRegression is: 150.4376191929306
(360, 9) (360,) The rmse of the XgbRegression is: 157.80004523008589
(360, 9) (360,) The rmse of the XgbRegression is: 106.71409876836324
(360, 9) (360,) The rmse of the XgbRegression is: 164.3025653761061
(360, 9) (360,) The rmse of the XgbRegression is: 167.06615300896544
xgb_pred_test = xgb_model.predict(X_test)
print('The rmse of the XgbRegression is:',metrics.mean_squared_error(y_test, xgb_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, xgb_pred_test, 'r', label='xgb predict')
plt.legend(loc='best')
plt.title('xgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(xgb_model, f'{model_save_file}/xgb_model.pkl')
result_model['XGBoost(k)'] = metrics.mean_squared_error(y_test,xgb_pred_test)
The rmse of the XgbRegression is: 161.20656664940898
xgb_pred_test
array([84.90975 , 40.411938 , 84.90975 , 40.36061 , 30.81254 ,
58.17785 , 40.813774 , 47.137566 , 36.048138 , 34.232754 ,
36.048138 , 24.441133 , 17.381983 , 18.252796 , 4.251944 ,
6.372212 , 42.83542 , 40.505272 , 17.315933 , 36.048138 ,
45.864594 , 38.36129 , 19.748665 , 48.605034 , 32.474518 ,
29.898304 , 8.696232 , 38.491615 , 8.650021 , 36.20542 ,
41.521667 , 26.360083 , 24.259657 , 40.505272 , 18.623236 ,
35.769142 , 40.18205 , 20.484081 , 9.788815 , 18.752897 ,
25.451082 , 33.60631 , 35.769142 , 15.704942 , 24.441133 ,
42.301727 , 23.094816 , 17.997398 , 31.184462 , 84.41048 ,
16.69752 , 40.505272 , 24.441133 , 33.246864 , 24.303764 ,
33.913746 , 37.34529 , 30.070019 , 30.56996 , 28.185186 ,
12.755292 , 16.744617 , 18.24543 , 13.301644 , 26.466618 ,
40.145653 , 35.153057 , 36.01605 , 34.90764 , 15.838131 ,
21.322765 , 0.3356682, 50.4274 , 28.103344 , 20.186169 ,
23.172392 , 24.441133 , 47.137566 , 47.76451 , 28.274157 ,
22.182278 , 28.796017 , 26.188715 , 34.746414 , 22.084042 ,
25.610353 , 18.504105 , 44.952965 , 46.569164 , 40.813774 ,
34.746414 , 18.392904 , 40.65509 , 40.07162 , 11.603221 ,
42.74766 , 23.828777 , 80.21579 , 27.678026 , 33.407772 ,
19.191181 , 30.66869 , 32.336376 , 7.084119 , 24.303764 ,
80.21579 , 28.97032 , 0.4624677, 19.506504 , 34.489136 ,
28.275036 , 8.650021 , 36.209553 ], dtype=float32)
import lightgbm as lgb
lgb_preds = np.zeros(X_traint.shape[0])
for i, (train_index, valid_index) in enumerate(kf.split(X_traint, y_traint)):
print('************************************ {} ************************************'.format(str(i+1)))
X_tr, y_tr, X_va, y_va = X_traint.iloc[train_index], \
y_traint.iloc[train_index], X_traint.iloc[valid_index], y_traint.iloc[valid_index]
params_lgb = {
'boosting_type': 'gbdt',
'objective': 'regression',
'learning_rate': 0.1,
'n_estimators': 300,
'metric': 'root_mean_squared_error',
'min_child_weight': 1e-3,
'min_child_samples': 10,
'num_leaves': 31,
'max_depth': -1,
'seed': 2023,
'verbose': -1,
}
lgb_model = lgb.LGBMRegressor(**params_lgb)
lgb_model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_va, y_va)], eval_metric=['mae','rmse'])
lgb_val_pred = lgb_model.predict(X_va)
lgb_preds[valid_index] = lgb_val_pred
print('The rmse of the LgbRegression is:',metrics.mean_squared_error(y_va,lgb_val_pred))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, lgb_val_pred, 'r', label='lgb predict')
plt.legend(loc='best')
plt.title('lgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
************************************ 1 ************************************ The rmse of the LgbRegression is: 144.15979510245958
************************************ 2 ************************************ The rmse of the LgbRegression is: 131.31738146739463
************************************ 3 ************************************ The rmse of the LgbRegression is: 146.76703142514518
************************************ 4 ************************************ The rmse of the LgbRegression is: 107.73408809186085
************************************ 5 ************************************ The rmse of the LgbRegression is: 203.50332298250953
lgb_pred_test = lgb_model.predict(X_testt)
print('The rmse of the test is:',metrics.mean_squared_error(y_testt,lgb_pred_test))
x = list(range(0, len(y_testt)))
plt.figure(figsize=(40,10))
plt.plot(x[:100], y_testt[:100], 'g', label='ground-turth')
plt.plot(x[:100], lgb_pred_test[:100], 'r', label='lgb predict')
plt.legend(loc='best')
plt.title('lgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(lgb_model, f'{model_save_file}/lgb_model.pkl')
result_model['Lightgbm(k)'] = metrics.mean_squared_error(y_testt,lgb_pred_test)
The rmse of the test is: 168.0038498088402
from catboost import CatBoostRegressor
params = {
'learning_rate': 0.05,
'loss_function': "RMSE",
'eval_metric': "RMSE", # CrossEntropy
'depth': 8,
'min_data_in_leaf': 20,
'random_seed': 42,
'logging_level': 'Silent',
'use_best_model': True,
'one_hot_max_size': 5, #类别数量多于此数将使用ordered target statistics编码方法,默认值为2。
'boosting_type':"Ordered", #Ordered 或者Plain,数据量较少时建议使用Ordered,训练更慢但能够缓解梯度估计偏差。
'max_ctr_complexity': 2, #特征组合的最大特征数量,设置为1取消特征组合,设置为2只做两个特征的组合,默认为4。
'nan_mode': 'Min'
}
iterations = 400
early_stopping_rounds = 200
ctb_model = CatBoostRegressor(iterations=iterations,
early_stopping_rounds = early_stopping_rounds,
**params)
for i, (train_index, valid_index) in enumerate(kf.split(X_traint)):
print('************************************ {} ************************************'.format(str(i+1)))
x_tr, x_va = X_traint.iloc[train_index], X_traint.iloc[valid_index]
y_tr, y_va = y_traint.iloc[train_index], y_traint.iloc[valid_index]
print(x_tr.shape, y_tr.shape)
ctb_model.fit(x_tr, y_tr, eval_set=(x_va, y_va), verbose=0, early_stopping_rounds=1000)
ctb_pre= ctb_model.predict(x_va)
print('The rmse of the CatRegression is:',metrics.mean_squared_error(y_va,ctb_pre))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, ctb_pre, 'r', label='cat predict')
plt.legend(loc='best')
plt.title('cat-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
************************************ 1 ************************************ (360, 9) (360,) The rmse of the CatRegression is: 137.65860243967714
************************************ 2 ************************************ (360, 9) (360,) The rmse of the CatRegression is: 91.00799259127238
************************************ 3 ************************************ (360, 9) (360,) The rmse of the CatRegression is: 151.9945800734046
************************************ 4 ************************************ (360, 9) (360,) The rmse of the CatRegression is: 118.63014436668998
************************************ 5 ************************************ (360, 9) (360,) The rmse of the CatRegression is: 140.55848059764645
cat_pred_test = ctb_model.predict(X_testt)
print('The rmse of the test is:',metrics.mean_squared_error(y_testt,cat_pred_test))
x = list(range(0, len(y_testt)))
plt.figure(figsize=(40,10))
plt.plot(x, y_testt, 'g', label='ground-turth')
plt.plot(x, cat_pred_test, 'r', label='cat predict')
plt.legend(loc='best')
plt.title('cat-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(ctb_model, f'{model_save_file}/ctb_model.pkl')
result_model['Catboost(k)'] = metrics.mean_squared_error(y_test,cat_pred_test)
The rmse of the test is: 134.20532181711567
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
ri_model = Ridge(alpha=0.5,fit_intercept=True)
lr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
print(x_tr.shape, y_tr.shape)
ri_model.fit(x_tr, y_tr)
lr_pred_valid = ri_model.predict(x_va)
lr_preds[valid_index] = lr_pred_valid
print('The rmse of the LR is:',metrics.mean_squared_error(y_va,lr_pred_valid))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, lr_pred_valid, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
(360, 9) (360,) The rmse of the LR is: 219.01021050096995
(360, 9) (360,) The rmse of the LR is: 426.9549340311016
(360, 9) (360,) The rmse of the LR is: 297.35225997165344
(360, 9) (360,) The rmse of the LR is: 294.53694723908524
(360, 9) (360,) The rmse of the LR is: 365.18053210044656
ri_pred_test = ri_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,ri_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, ri_pred_test, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(ri_model, f'{model_save_file}/ri_model.pkl')
result_model['RidgeRegression(k)'] = metrics.mean_squared_error(y_test,ri_pred_test)
The rmse of the test is: 326.04477768286097
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
lr_model = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)
lr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
print(x_tr.shape, y_tr.shape)
lr_model.fit(x_tr, y_tr)
lr_pred_valid = lr_model.predict(x_va)
lr_preds[valid_index] = lr_pred_valid
# print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
print('The rmse of the LR is:',metrics.mean_squared_error(y_va,lr_pred_valid))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, lr_pred_valid, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
(360, 9) (360,) The rmse of the LR is: 314.1936119105932
(360, 9) (360,) The rmse of the LR is: 328.36971054412
(360, 9) (360,) The rmse of the LR is: 286.59978825058465
(360, 9) (360,) The rmse of the LR is: 335.67162987133275
(360, 9) (360,) The rmse of the LR is: 298.7482991436072
lr_pred_test = lr_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,lr_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, lr_pred_test, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(lr_model, f'{model_save_file}/lr_model.pkl')
result_model['LinearRegression(k)'] = metrics.mean_squared_error(y_test,lr_pred_test)
The rmse of the test is: 317.42579423069157
from sklearn.neural_network import MLPRegressor
mlp_model = MLPRegressor(solver='lbfgs',alpha=1e-5, hidden_layer_sizes=(40,40), max_iter=500, random_state=2023)
mlp_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
print(x_tr.shape, y_tr.shape)
mlp_model.fit(x_tr, y_tr)
mlp_pred_valid = mlp_model.predict(x_va)
mlp_preds[valid_index] = mlp_pred_valid
# print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
print('The rmse of the MLP is:',metrics.mean_squared_error(y_va,mlp_pred_valid))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, mlp_pred_valid, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
(360, 9) (360,) The rmse of the MLP is: 172.16619071151197
(360, 9) (360,) The rmse of the MLP is: 287.3069464332374
(360, 9) (360,) The rmse of the MLP is: 177.94632641403044
(360, 9) (360,) The rmse of the MLP is: 219.3194947093819
(360, 9) (360,) The rmse of the MLP is: 190.39058581948484
mlp_pred_test = mlp_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,mlp_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, mlp_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(mlp_model, f'{model_save_file}/mlp_model.pkl')
result_model['MLP(k)'] = metrics.mean_squared_error(y_test,mlp_pred_test)
The rmse of the test is: 212.5785076065758
from sklearn.svm import SVR
svr_model = SVR(kernel ='rbf',
degree = 3,
coef0 = 0.0,
tol = 0.001,
C = 1.0,
epsilon = 0.1,
shrinking = True,
cache_size = 200,
verbose = False,
max_iter = -1)
svr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
print(x_tr.shape, y_tr.shape)
svr_model.fit(x_tr, y_tr)
svr_pred_valid = svr_model.predict(x_va)
svr_preds[valid_index] = svr_pred_valid
# print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
print('The rmse of the LR is:',metrics.mean_squared_error(y_va,svr_pred_valid))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, svr_pred_valid, 'r', label='svr predict')
plt.legend(loc='best')
plt.title('svr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
(360, 9) (360,) The rmse of the LR is: 336.94506431335725
(360, 9) (360,) The rmse of the LR is: 275.29736103784944
(360, 9) (360,) The rmse of the LR is: 281.4711931902122
(360, 9) (360,) The rmse of the LR is: 349.0051223555379
(360, 9) (360,) The rmse of the LR is: 387.8624151085627
svr_pred_test = svr_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,svr_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, svr_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(svr_model, f'{model_save_file}/svr_model.pkl')
result_model['SVR(k)'] = metrics.mean_squared_error(y_test,svr_pred_test)
The rmse of the test is: 378.8414160194384
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
# 创建高斯过程回归模型对象
gp_model = GaussianProcessRegressor(kernel=RBF())
gp_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
print(x_tr.shape, y_tr.shape)
gp_model.fit(x_tr, y_tr)
gp_pred_valid = gp_model.predict(x_va)
gp_preds[valid_index] = gp_pred_valid
print('The rmse of the LR is:',metrics.mean_squared_error(y_va,gp_pred_valid))
x = list(range(0, len(y_va)))
plt.figure(figsize=(40,10))
plt.plot(x, y_va, 'g', label='ground-turth')
plt.plot(x, gp_pred_valid, 'r', label='gp predict')
plt.legend(loc='best')
plt.title('gp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
(360, 9) (360,) The rmse of the LR is: 834.3952646708552
(360, 9) (360,) The rmse of the LR is: 754.7523866750222
(360, 9) (360,) The rmse of the LR is: 792.1270485352245
(360, 9) (360,) The rmse of the LR is: 548.6501813244878
(360, 9) (360,) The rmse of the LR is: 900.3367960162427
gp_pred_test = gp_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,gp_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, gp_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(gp_model, f'{model_save_file}/gp_model.pkl')
result_model['GR(k)'] = metrics.mean_squared_error(y_test,gp_pred_test)
The rmse of the test is: 880.5795542654563
import json
# 将字典保存为 JSON 文件
with open(model_performance_json, 'w') as json_file:
json.dump(result_model, json_file)
import plotly.express as px
import plotly.io as pio
with open(model_performance_json, 'r') as json_file:
result_model = json.load(json_file)
result_model = dict(sorted(result_model.items(), key=lambda item: item[1]))
categories = list(result_model.keys())
values = list(result_model.values())
color_mapping = {}
for k, v in result_model.items():
if v < 0.4:
color_mapping[k] = "Tree-Base"
else:
color_mapping[k] = "Other"
# 创建柱状图
fig = px.bar(x=categories, y=values, title='Models Performance in ln(EF)', color=color_mapping)
fig.update_layout(template="seaborn")
# 显示图表
fig.show()
# 保存柱状图为 HTML 文件
pio.write_html(fig, file=model_performance_path)
import shap
shap.initjs()
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=800))
colors=px.colors.qualitative.Plotly
feat_importance=pd.DataFrame()
feat_importance["Importance"]=rf_model.feature_importances_
feat_importance.set_index(X_train.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]
fig=go.Figure()
for i in range(len(feat_importance.index)):
fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i],
line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
marker_color=pal[::-1], marker_size=8,
hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance',
xaxis=dict(title='Average Importance',zeroline=False),
yaxis_showgrid=False, margin=dict(l=120,t=80),
height=700, width=800)
fig.show()
feat_importance=pd.DataFrame()
feat_importance["Importance"]=xgb_model.feature_importances_
feat_importance.set_index(X_train.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]
fig=go.Figure()
for i in range(len(feat_importance.index)):
fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i],
line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
marker_color=pal[::-1], marker_size=8,
hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance',
xaxis=dict(title='Average Importance',zeroline=False),
yaxis_showgrid=False, margin=dict(l=120,t=80),
height=700, width=800)
fig.show()
feat_importance=pd.DataFrame()
feat_importance["Importance"]=ctb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]
fig=go.Figure()
for i in range(len(feat_importance.index)):
fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i],
line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
marker_color=pal[::-1], marker_size=8,
hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance',
xaxis=dict(title='Average Importance',zeroline=False),
yaxis_showgrid=False, margin=dict(l=120,t=80),
height=700, width=800)
fig.show()
feat_importance=pd.DataFrame()
feat_importance["Importance"]=lgb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]
fig=go.Figure()
for i in range(len(feat_importance.index)):
fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i],
line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
marker_color=pal[::-1], marker_size=8,
hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance',
xaxis=dict(title='Average Importance',zeroline=False),
yaxis_showgrid=False, margin=dict(l=120,t=80),
height=700, width=800)
fig.show()
SHAP是由Shapley value启发的可加性解释模型。对于每个预测样本,模型都产生一个预测值,SHAP value就是该样本中每个特征所分配到的数值。 很明显可以看出,与上一节中feature importance相比,SHAP value最大的优势是SHAP能对于反映出每一个样本中的特征的影响力,而且还表现出影响的正负性。
# 获取feature importance
plt.figure(figsize=(15, 5))
feat_importance=pd.DataFrame()
feat_importance["Importance"]=lgb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance', ascending=False)
plt.bar(range(len(X_traint.columns)), feat_importance['Importance'])
plt.xticks(range(len(X_traint.columns)), feat_importance.index, rotation=90, fontsize=14)
plt.title('Feature importance', fontsize=14)
plt.show()
lgb_explainer = shap.TreeExplainer(lgb_model)
lgb_shap_values = lgb_explainer.shap_values(X_traint)
print(lgb_shap_values.shape)
(450, 9)
shap.force_plot(lgb_explainer.expected_value, lgb_shap_values[0,:], X_traint.iloc[0,:])
对第一个实例的特征贡献图也可用 waterfall 方式展示
shap.plots._waterfall.waterfall_legacy(
lgb_explainer.expected_value,
lgb_shap_values[0],
feature_names=X_traint.columns)
上图的解释显示了每个有助于将模型输出从基值(我们传递的训练数据集上的平均模型输出)贡献到模型输出值的特征。将预测推高的特征以红色显示,将预测推低的特征以蓝色显示。
如果我们采取许多实例来聚合显示解释,如下图所示,将它们旋转 90 度,然后将它们水平堆叠,我们可以看到整个数据集的解释(在 Notebook 中,此图是交互式的)
# visualize the training set predictions
shap.force_plot(lgb_explainer.expected_value, lgb_shap_values, X_traint)
下图中每一行代表一个特征,横坐标为SHAP值。一个点代表一个样本,颜色越红说明特征本身数值越大,颜色越蓝说明特征本身数值越小。
shap.summary_plot(lgb_shap_values, X_traint)
shap value值排序的特征重要性
shap.summary_plot(lgb_shap_values, X_traint, plot_type="bar")
SHAP也提供了部分依赖图的功能,与传统的部分依赖图不同的是,这里纵坐标不是目标变量y的数值而是SHAP值。可以观察各个特征的分布与目标shap值的关系。
fig, axes = plt.subplots(len(input_cols)//3+1, 3, figsize=(20,30))
for i, col in enumerate(input_cols):
shap.dependence_plot(col, lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[i//3,i%3])
# shap.dependence_plot('Sand (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[0,2])
# shap.dependence_plot('Silt (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,0])
# shap.dependence_plot('Clay (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,1])
# shap.dependence_plot('SOC (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,2])
# shap.dependence_plot('TN (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[2,1])
# shap.dependence_plot('C/N', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[2,2])
# shap.dependence_plot('pH', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,0])
# shap.dependence_plot('TN (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,1])
# shap.dependence_plot('BD', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,2])
# shap.dependence_plot('CEC', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,0])
# shap.dependence_plot('N application', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
# shap.dependence_plot('BNE', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
# shap.dependence_plot('MAT (°C)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
# shap.dependence_plot('MAP (mm)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
我们也可以多个变量的交互作用进行分析。一种方式是采用summary_plot描绘出散点图,如下:
shap_interaction_values = shap.TreeExplainer(lgb_model).shap_interaction_values(X_traint)
plt.figure(figsize=(12,12))
shap.summary_plot(shap_interaction_values, X_traint, max_display=6)
plt.show()
<Figure size 1200x1200 with 0 Axes>
我们也可以用dependence_plot描绘两个变量交互下变量对目标值的影响。
shap.dependence_plot(input_cols[0], lgb_shap_values, X_traint, interaction_index=input_cols[1], show=False)
也能可视化每种特征对于整体预测值的影响。
shap.decision_plot(lgb_explainer.expected_value, lgb_shap_values, X_traint, ignore_warnings=True)
feat_importance=pd.DataFrame()
feat_importance["Importance"]=ctb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]
fig=go.Figure()
for i in range(len(feat_importance.index)):
fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i],
line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
marker_color=pal[::-1], marker_size=8,
hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance',
xaxis=dict(title='Average Importance',zeroline=False),
yaxis_showgrid=False, margin=dict(l=120,t=80),
height=700, width=800)
fig.show()